1. Setup

knitr::opts_chunk$set(
    echo = TRUE,
    message = FALSE,
    warning = FALSE
)
# set working directory
setwd("C:/Users/vo_j4/Desktop/github data")
# load  datasets
IGM_data <-read.csv("GDIM_2023_03.csv")
gini_data <- read.csv("gini 1991-2017 feb 22, 2024.csv")

# load packages
library(leaps)
library(dplyr)
library(ggplot2)
library(ggforce)
library(stats)
library(ggalt)
library(plot3D)
library(plotly)
library(gridExtra) 
library(cluster)
library(factoextra)
library(caret)

2. Cleaning data

2.1. Gini dataset

head(gini_data)
# Rename the variables
colnames(gini_data)[colnames(gini_data) == "X1991..YR1991."] <- "1991"
colnames(gini_data)[colnames(gini_data) == "X1992..YR1992."] <- "1992"
colnames(gini_data)[colnames(gini_data) == "X1993..YR1993."] <- "1993"
colnames(gini_data)[colnames(gini_data) == "X1994..YR1994."] <- "1994"
colnames(gini_data)[colnames(gini_data) == "X1995..YR1995."] <- "1995"
colnames(gini_data)[colnames(gini_data) == "X1996..YR1996."] <- "1996"
colnames(gini_data)[colnames(gini_data) == "X1997..YR1997."] <- "1997"
colnames(gini_data)[colnames(gini_data) == "X1998..YR1998."] <- "1998"
colnames(gini_data)[colnames(gini_data) == "X1999..YR1999."] <- "1999"
colnames(gini_data)[colnames(gini_data) == "X2000..YR2000."] <- "2000"
colnames(gini_data)[colnames(gini_data) == "X2001..YR2001."] <- "2001"
colnames(gini_data)[colnames(gini_data) == "X2002..YR2002."] <- "2002"
colnames(gini_data)[colnames(gini_data) == "X2003..YR2003."] <- "2003"
colnames(gini_data)[colnames(gini_data) == "X2004..YR2004."] <- "2004"
colnames(gini_data)[colnames(gini_data) == "X2005..YR2005."] <- "2005"
colnames(gini_data)[colnames(gini_data) == "X2006..YR2006."] <- "2006"
colnames(gini_data)[colnames(gini_data) == "X2007..YR2007."] <- "2007"
colnames(gini_data)[colnames(gini_data) == "X2008..YR2008."] <- "2008"
colnames(gini_data)[colnames(gini_data) == "X2009..YR2009."] <- "2009"
colnames(gini_data)[colnames(gini_data) == "X2010..YR2010."] <- "2010"
colnames(gini_data)[colnames(gini_data) == "X2011..YR2011."] <- "2011"
colnames(gini_data)[colnames(gini_data) == "X2012..YR2012."] <- "2012"
colnames(gini_data)[colnames(gini_data) == "X2013..YR2013."] <- "2013"
colnames(gini_data)[colnames(gini_data) == "X2014..YR2014."] <- "2014"
colnames(gini_data)[colnames(gini_data) == "X2015..YR2015."] <- "2015"
colnames(gini_data)[colnames(gini_data) == "X2016..YR2016."] <- "2016"
colnames(gini_data)[colnames(gini_data) == "X2017..YR2017."] <- "2017"

# empty NAs cells instead of ",,"
columns_to_process <- colnames(gini_data)[5:ncol(gini_data)] ## specify the columns to be processed (from "1991" to "2017")

for (col in columns_to_process) { ## loop through each specified column and replace ".." with ""
  gini_data[gini_data[[col]] == "..", col] <- ""
  gini_data[[col]] <- as.numeric(gini_data[[col]])
}

2.2. IGM dataset

library(dplyr)

# group by country and calculate the mean for numerical variables in columns 16 to 90
numerical <- IGM_data %>%
  group_by(country) %>%
  summarize(across(15:89, ~mean(., na.rm = TRUE)))

# transfer over categorical variables in columns 1 to 12 because they're all identical
categorical <- IGM_data  %>%
  group_by(country) %>%
  summarize(across(0:11, ~first(.)))

# merge 2 datasets into df
df <- merge( categorical,numerical, by = "country")
 
# add a new variable "gini" to df
df$gini <- NA

for (i in 1:nrow(df)) {
  # extract year, country, code columns
  country_code <- df$code[i]
  temp_year <- as.character(df$year[i])
  
  # find corresponding column in gini_data
  col_name <- temp_year  # Assuming column names are just the years
  
  # subset gini_data based on country code
  matching_rows <- gini_data$Country.Code == country_code
  
  # check if there are any matching rows
  if (any(matching_rows)) {
    # match the Country.Code and assign the value to "gini"
    df$gini[i] <- gini_data[matching_rows, col_name]
  } else {
    # if no matching rows, assign NA
    df$gini[i] <- "none"
  }
}

df$gini
##   [1] NA     "33.7" "42.7" "41.1" "32.5" NA     "30.8" NA     "32.1" "25.3"
##  [11] "27.6" "43.4" "40.9" "50.8" NA     "53.3" "52"   "36"   "39.8" "38.6"
##  [21] "47.2" NA     "42.8" "33.2" "56.2" "43.3" "45.8" "42.2" "52.6" "45.3"
##  [31] "42.1" "48.9" "48.3" "43.2" "30.9" "34.3" "25.4" "28.4" "41.6" "42.2"
##  [41] "46.9" "28.3" "38"   "31.2" "51.5" NA     "40.4" "27.1" "31.9" "38"  
##  [51] "35.9" "36.6" "31.4" "42.4" "35"   "48.3" "43"   "50.7" "41.1" "49.4"
##  [61] "30.3" "27.2" "35.7" "40.2" "38.8" "29.5" "32.8" "39"   "35.2" NA    
##  [71] "33.7" "27.2" NA     "37"   "31.2" "26.7" "26.8" "36"   "34.3" "31.8"
##  [81] "44.9" "33.2" "38.4" "42.6" NA     "41.1" "38.4" NA     "37.7" "38.5"
##  [91] NA     "26.3" "32.3" "38.5" "40.7" "45.6" "38.1" "59.1" NA     "28.2"
## [101] NA     NA     "34.3" "35.5" "34.5" "28.5" NA     "52.7" "41.9" "48.5"
## [111] "43.1" NA     "31.2" "35.2" "34.4" "36.8" "48.5" "30.8" "40.3" "38.8"
## [121] "34"   "26.1" "24.8" NA     "63"   "46.3" "35.8" "38.7" "35.4" "29.6"
## [131] "33"   "none" NA     NA     "39.3" "27.8" "43.1" "37.5" NA     "41.9"
## [141] "39.1" NA     "24.7" "33.1" "41.2" "39.5" NA     "37.4" NA     "35.6"
## [151] "34.4" "36.7" "52"
# quick look at gini
summary(df$gini)
##    Length     Class      Mode 
##       153 character character

2.2.1. Gini NAs processing

# see which countries had NA value and which year that was
na_countries <- data.frame(cbind(df[is.na(df$gini), c("country", "year")]))
na_countries
# after manually select the appropriate value for each of those NAs,add them to our dataset 
# reference Methods section and README file for further details

NA_countries <- c("Afghanistan", "Australia", "Azerbaijan", "Bosnia and Herzegovina", "Cambodia", "Ethiopia", "Japan", "Kenya", "Malawi", "Mali", "Mexico", "Nepal", "New Zealand", "Nicaragua", "Pakistan", "Philippines", "Solomon Islands", "Taiwan, China", "Tajikistan", "Tanzania", "Tunisia", "Uganda", "Uzbekistan", "Venezuela, RB")
NA_gini <- c(31.6, 32.3, 26.6, 33, 30.76, 35, 32.9, 40.8, 45.5, 33, 47.2, 32.8, 36.2, 46.2, 28.7, 47.7, 37.1,33.8, 34, 37.8, 32.8, 41, 31.2, 39)
# match country names with corresponding gini values
for (i in 1:length(NA_countries)) { 
  df$gini[df$country == NA_countries[i]] <- NA_gini[i]
}
df$gini <- as.numeric(df$gini)

3. EDA

# checking correlation between all BH variables
cor(df[47:58])
##                     BHQ1_randomtiebreak    BHQ1_lb     BHQ1_ub
## BHQ1_randomtiebreak          1.00000000  0.8454426  0.07490105
## BHQ1_lb                      0.84544256  1.0000000 -0.40357163
## BHQ1_ub                      0.07490105 -0.4035716  1.00000000
## BHQ2_randomtiebreak          0.29824233  0.2037934  0.08968521
## BHQ2_lb                      0.51270699  0.8590112 -0.73658963
## BHQ2_ub                     -0.50421392 -0.8366010  0.75558397
## BHQ3_randomtiebreak         -0.84575565 -0.7028628 -0.09771206
## BHQ3_lb                      0.42063623  0.7531698 -0.74055337
## BHQ3_ub                     -0.77857303 -0.9401619  0.46896606
## BHQ4_randomtiebreak         -0.81292607 -0.6657970 -0.08069578
## BHQ4_lb                      0.20224092  0.4974151 -0.67837860
## BHQ4_ub                     -0.87037047 -0.8079332  0.05811190
##                     BHQ2_randomtiebreak    BHQ2_lb    BHQ2_ub
## BHQ1_randomtiebreak          0.29824233  0.5127070 -0.5042139
## BHQ1_lb                      0.20379338  0.8590112 -0.8366010
## BHQ1_ub                      0.08968521 -0.7365896  0.7555840
## BHQ2_randomtiebreak          1.00000000  0.2309542  0.1253588
## BHQ2_lb                      0.23095416  1.0000000 -0.8972863
## BHQ2_ub                      0.12535879 -0.8972863  1.0000000
## BHQ3_randomtiebreak         -0.36135509 -0.4253365  0.3612155
## BHQ3_lb                     -0.09426237  0.8825828 -0.9550175
## BHQ3_ub                     -0.17269690 -0.8761475  0.8823016
## BHQ4_randomtiebreak         -0.71718143 -0.4677107  0.2849251
## BHQ4_lb                     -0.36429043  0.6282982 -0.8185536
## BHQ4_ub                     -0.59400394 -0.6242911  0.4790428
##                     BHQ3_randomtiebreak     BHQ3_lb    BHQ3_ub
## BHQ1_randomtiebreak         -0.84575565  0.42063623 -0.7785730
## BHQ1_lb                     -0.70286277  0.75316985 -0.9401619
## BHQ1_ub                     -0.09771206 -0.74055337  0.4689661
## BHQ2_randomtiebreak         -0.36135509 -0.09426237 -0.1726969
## BHQ2_lb                     -0.42533651  0.88258275 -0.8761475
## BHQ2_ub                      0.36121554 -0.95501754  0.8823016
## BHQ3_randomtiebreak          1.00000000 -0.21675476  0.6941948
## BHQ3_lb                     -0.21675476  1.00000000 -0.8297685
## BHQ3_ub                      0.69419481 -0.82976847  1.0000000
## BHQ4_randomtiebreak          0.58650480 -0.29295825  0.5770133
## BHQ4_lb                     -0.23982120  0.78868297 -0.6987643
## BHQ4_ub                      0.64124433 -0.48468645  0.7410092
##                     BHQ4_randomtiebreak     BHQ4_lb     BHQ4_ub
## BHQ1_randomtiebreak         -0.81292607  0.20224092 -0.87037047
## BHQ1_lb                     -0.66579698  0.49741507 -0.80793317
## BHQ1_ub                     -0.08069578 -0.67837860  0.05811190
## BHQ2_randomtiebreak         -0.71718143 -0.36429043 -0.59400394
## BHQ2_lb                     -0.46771074  0.62829819 -0.62429114
## BHQ2_ub                      0.28492515 -0.81855355  0.47904275
## BHQ3_randomtiebreak          0.58650480 -0.23982120  0.64124433
## BHQ3_lb                     -0.29295825  0.78868297 -0.48468645
## BHQ3_ub                      0.57701334 -0.69876430  0.74100924
## BHQ4_randomtiebreak          1.00000000  0.12638517  0.96023901
## BHQ4_lb                      0.12638517  1.00000000 -0.08160403
## BHQ4_ub                      0.96023901 -0.08160403  1.00000000
ggplot(as.data.frame(as.table(cor(df[47:58]))), aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  theme_minimal() +
  labs(title = "BH Variables Correlation Heatmap", x = "", y = "")

# checking correlation between all TM variables
cor(df[63:87])
##             tm11         tm12         tm13          tm14        tm15
## tm11  1.00000000 -0.192668751 -0.549123110 -0.6837711482 -0.53637556
## tm12 -0.19266875  1.000000000 -0.162739052 -0.1542671357 -0.24399566
## tm13 -0.54912311 -0.162739052  1.000000000  0.0073007649  0.14703920
## tm14 -0.68377115 -0.154267136  0.007300765  1.0000000000  0.39694724
## tm15 -0.53637556 -0.243995665  0.147039199  0.3969472444  1.00000000
## tm21  0.88900218 -0.261230000 -0.436943848 -0.5930346174 -0.46827871
## tm22  0.27495025  0.720399237 -0.380058988 -0.4265771125 -0.38468978
## tm23 -0.35566401 -0.107416032  0.877386001 -0.1456100634 -0.02082919
## tm24 -0.67779560 -0.004532989  0.007239148  0.8946297617  0.36885240
## tm25 -0.29779725 -0.207520754 -0.033273027  0.2868576820  0.73671092
## tm31  0.84060840 -0.249399414 -0.440937462 -0.5252967002 -0.45906778
## tm32  0.53505474  0.418080046 -0.434948380 -0.5436615053 -0.43715553
## tm33 -0.08510869 -0.153871526  0.680180851 -0.2847389662 -0.17673760
## tm34 -0.60825509  0.077954886  0.027299101  0.7817299071  0.21585849
## tm35 -0.14808332  0.039741878 -0.032928849 -0.0334161351  0.53632580
## tm41  0.74380942 -0.190957881 -0.412638014 -0.4766317313 -0.38530014
## tm42  0.51640368  0.195707604 -0.330470559 -0.4936522945 -0.34316675
## tm43  0.19723799 -0.134398176  0.410160685 -0.4830180660 -0.21689899
## tm44 -0.51813867  0.072524705  0.015177891  0.7126880000  0.09715910
## tm45 -0.18815775  0.034068154  0.080297336 -0.0007170447  0.40154090
## tm51  0.68271292 -0.152852996 -0.397453237 -0.4292570209 -0.36858968
## tm52  0.58496607  0.078816756 -0.429421568 -0.4240138010 -0.34852914
## tm53  0.34626730 -0.098949615  0.166250744 -0.4773380155 -0.30907783
## tm54 -0.28151009  0.069732827 -0.020110545  0.4537792325 -0.07149219
## tm55 -0.47180422  0.052139616  0.204095525  0.2819801404  0.48931456
##              tm21        tm22        tm23         tm24        tm25        tm31
## tm11  0.889002181  0.27495025 -0.35566401 -0.677795596 -0.29779725  0.84060840
## tm12 -0.261230000  0.72039924 -0.10741603 -0.004532989 -0.20752075 -0.24939941
## tm13 -0.436943848 -0.38005899  0.87738600  0.007239148 -0.03327303 -0.44093746
## tm14 -0.593034617 -0.42657711 -0.14561006  0.894629762  0.28685768 -0.52529670
## tm15 -0.468278712 -0.38468978 -0.02082919  0.368852401  0.73671092 -0.45906778
## tm21  1.000000000  0.11279998 -0.33097968 -0.676304518 -0.35167165  0.87396046
## tm22  0.112799981  1.00000000 -0.27269379 -0.371649023 -0.36381178  0.11980013
## tm23 -0.330979681 -0.27269379  1.00000000 -0.180547912 -0.21724112 -0.30734039
## tm24 -0.676304518 -0.37164902 -0.18054791  1.000000000  0.24545329 -0.57209409
## tm25 -0.351671649 -0.36381178 -0.21724112  0.245453292  1.00000000 -0.33934797
## tm31  0.873960464  0.11980013 -0.30734039 -0.572094086 -0.33934797  1.00000000
## tm32  0.444573506  0.77018052 -0.31165620 -0.554585290 -0.33577669  0.39202326
## tm33  0.001492836 -0.07140452  0.75994192 -0.413312529 -0.32437743 -0.06509326
## tm34 -0.607570660 -0.29783185 -0.07694551  0.912657690  0.02464586 -0.53910874
## tm35 -0.212717581 -0.05167199 -0.15572473 -0.044798335  0.77690344 -0.33424709
## tm41  0.774520260  0.13730328 -0.26723534 -0.547575688 -0.27479792  0.87051875
## tm42  0.444337972  0.59618013 -0.24730151 -0.524597824 -0.27357423  0.45884572
## tm43  0.252595852  0.06446007  0.53518905 -0.599791436 -0.30218003  0.22126395
## tm44 -0.501308413 -0.21686862 -0.04097268  0.801598842 -0.13344450 -0.43137789
## tm45 -0.231542857 -0.12911678 -0.05865449  0.030946113  0.62844510 -0.35042026
## tm51  0.750786407  0.13556447 -0.32371901 -0.484544643 -0.25410238  0.71656721
## tm52  0.523567583  0.47380970 -0.35252887 -0.438862420 -0.26943473  0.51213840
## tm53  0.435647079  0.14828763  0.26335535 -0.590448036 -0.36161706  0.39899581
## tm54 -0.286708374 -0.07386091 -0.06894878  0.558449808 -0.24468997 -0.16962722
## tm55 -0.532071436 -0.23483382  0.12564218  0.306337574  0.56775276 -0.57185231
##            tm32         tm33        tm34        tm35        tm41        tm42
## tm11  0.5350547 -0.085108688 -0.60825509 -0.14808332  0.74380942  0.51640368
## tm12  0.4180800 -0.153871526  0.07795489  0.03974188 -0.19095788  0.19570760
## tm13 -0.4349484  0.680180851  0.02729910 -0.03292885 -0.41263801 -0.33047056
## tm14 -0.5436615 -0.284738966  0.78172991 -0.03341614 -0.47663173 -0.49365229
## tm15 -0.4371555 -0.176737605  0.21585849  0.53632580 -0.38530014 -0.34316675
## tm21  0.4445735  0.001492836 -0.60757066 -0.21271758  0.77452026  0.44433797
## tm22  0.7701805 -0.071404515 -0.29783185 -0.05167199  0.13730328  0.59618013
## tm23 -0.3116562  0.759941916 -0.07694551 -0.15572473 -0.26723534 -0.24730151
## tm24 -0.5545853 -0.413312529  0.91265769 -0.04479833 -0.54757569 -0.52459782
## tm25 -0.3357767 -0.324377432  0.02464586  0.77690344 -0.27479792 -0.27357423
## tm31  0.3920233 -0.065093255 -0.53910874 -0.33424709  0.87051875  0.45884572
## tm32  1.0000000 -0.067968996 -0.52532615 -0.11755072  0.36395132  0.76322286
## tm33 -0.0679690  1.000000000 -0.39022417 -0.30611586 -0.06017394 -0.01653481
## tm34 -0.5253262 -0.390224173  1.00000000 -0.23621390 -0.50933749 -0.51672972
## tm35 -0.1175507 -0.306115862 -0.23621390  1.00000000 -0.24403067 -0.11038990
## tm41  0.3639513 -0.060173942 -0.50933749 -0.24403067  1.00000000  0.41526891
## tm42  0.7632229 -0.016534813 -0.51672972 -0.11038990  0.41526891  1.00000000
## tm43  0.1957674  0.766959630 -0.59517644 -0.21051713  0.17557327  0.20644735
## tm44 -0.4097170 -0.238785411  0.90465796 -0.41125472 -0.40996725 -0.41241333
## tm45 -0.1990964 -0.206948274 -0.11417688  0.79469708 -0.39888368 -0.33310065
## tm51  0.3378027 -0.075457900 -0.42910252 -0.18349862  0.72609825  0.43061903
## tm52  0.6607518 -0.086414418 -0.41213068 -0.18383325  0.49095592  0.68040461
## tm53  0.3483652  0.582687309 -0.58258090 -0.30332533  0.35861435  0.35678119
## tm54 -0.2112327 -0.145036207  0.65272646 -0.50172406 -0.20401781 -0.18486343
## tm55 -0.4007454 -0.195914446  0.20696996  0.65245809 -0.52192641 -0.46776174
##             tm43         tm44          tm45        tm51        tm52        tm53
## tm11  0.19723799 -0.518138672 -0.1881577465  0.68271292  0.58496607  0.34626730
## tm12 -0.13439818  0.072524705  0.0340681538 -0.15285300  0.07881676 -0.09894962
## tm13  0.41016068  0.015177891  0.0802973361 -0.39745324 -0.42942157  0.16625074
## tm14 -0.48301807  0.712688000 -0.0007170447 -0.42925702 -0.42401380 -0.47733802
## tm15 -0.21689899  0.097159097  0.4015408998 -0.36858968 -0.34852914 -0.30907783
## tm21  0.25259585 -0.501308413 -0.2315428573  0.75078641  0.52356758  0.43564708
## tm22  0.06446007 -0.216868615 -0.1291167824  0.13556447  0.47380970  0.14828763
## tm23  0.53518905 -0.040972685 -0.0586544943 -0.32371901 -0.35252887  0.26335535
## tm24 -0.59979144  0.801598842  0.0309461132 -0.48454464 -0.43886242 -0.59044804
## tm25 -0.30218003 -0.133444503  0.6284451034 -0.25410238 -0.26943473 -0.36161706
## tm31  0.22126395 -0.431377891 -0.3504202565  0.71656721  0.51213840  0.39899581
## tm32  0.19576739 -0.409716997 -0.1990964110  0.33780274  0.66075180  0.34836519
## tm33  0.76695963 -0.238785411 -0.2069482741 -0.07545790 -0.08641442  0.58268731
## tm34 -0.59517644  0.904657958 -0.1141768832 -0.42910252 -0.41213068 -0.58258090
## tm35 -0.21051713 -0.411254717  0.7946970788 -0.18349862 -0.18383325 -0.30332533
## tm41  0.17557327 -0.409967254 -0.3988836849  0.72609825  0.49095592  0.35861435
## tm42  0.20644735 -0.412413329 -0.3331006464  0.43061903  0.68040461  0.35678119
## tm43  1.00000000 -0.506015574 -0.2903541659  0.05250221  0.12426093  0.74862831
## tm44 -0.50601557  1.000000000 -0.3835263055 -0.30708158 -0.29483798 -0.49727548
## tm45 -0.29035417 -0.383526305  1.0000000000 -0.28309076 -0.31803041 -0.29812653
## tm51  0.05250221 -0.307081584 -0.2830907588  1.00000000  0.49850926  0.17839715
## tm52  0.12426093 -0.294837981 -0.3180304080  0.49850926  1.00000000  0.30530846
## tm53  0.74862831 -0.497275477 -0.2981265291  0.17839715  0.30530846  1.00000000
## tm54 -0.27670540  0.746700844 -0.4652100734 -0.17363570 -0.09384303 -0.32151846
## tm55 -0.32149457  0.007023986  0.7052339317 -0.54923452 -0.62067363 -0.54777104
##             tm54         tm55
## tm11 -0.28151009 -0.471804221
## tm12  0.06973283  0.052139616
## tm13 -0.02011054  0.204095525
## tm14  0.45377923  0.281980140
## tm15 -0.07149219  0.489314561
## tm21 -0.28670837 -0.532071436
## tm22 -0.07386091 -0.234833825
## tm23 -0.06894878  0.125642182
## tm24  0.55844981  0.306337574
## tm25 -0.24468997  0.567752758
## tm31 -0.16962722 -0.571852312
## tm32 -0.21123271 -0.400745438
## tm33 -0.14503621 -0.195914446
## tm34  0.65272646  0.206969960
## tm35 -0.50172406  0.652458093
## tm41 -0.20401781 -0.521926413
## tm42 -0.18486343 -0.467761742
## tm43 -0.27670540 -0.321494566
## tm44  0.74670084  0.007023986
## tm45 -0.46521007  0.705233932
## tm51 -0.17363570 -0.549234516
## tm52 -0.09384303 -0.620673632
## tm53 -0.32151846 -0.547771044
## tm54  1.00000000 -0.384719975
## tm55 -0.38471997  1.000000000
ggplot(as.data.frame(as.table(cor(df[63:87]))), aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  theme_minimal() +
  labs(title = "TM Variables Correlation Heatmap", x = "", y = "")

4. K means clustering on BH variables - Validation

After running K-means clustering on the 12 BH variables, we perform 2 levels of validations:

4.1. K means clustering on BH variables

# create plot of number of clusters vs total within sum of squares
fviz_nbclust(scale(df[47:58]), kmeans, method = "wss")

# k-means with k = 4
set.seed(123)
km.res_BH <- kmeans(scale(df[47:58]), 4, nstart = 25)
km.res_BH
## K-means clustering with 4 clusters of sizes 42, 26, 63, 22
## 
## Cluster means:
##   BHQ1_randomtiebreak    BHQ1_lb     BHQ1_ub BHQ2_randomtiebreak    BHQ2_lb
## 1           1.0322240  1.0692290 -0.01700147           0.5074830  0.7745319
## 2          -1.5175469 -1.5718189  0.67182677          -0.5998219 -1.3974493
## 3          -0.1542744  0.1456727 -0.67353069          -0.3760416  0.4194482
## 4           0.2646407 -0.6008047  1.16722726           0.8168956 -1.0282680
##      BHQ2_ub BHQ3_randomtiebreak    BHQ3_lb    BHQ3_ub BHQ4_randomtiebreak
## 1 -0.5717331          -0.8511281  0.4924190 -0.8292556          -0.9697621
## 2  1.4589229           1.2061701 -1.4408292  1.7722619           1.3655536
## 3 -0.5886225           0.1550720  0.6157683 -0.3673223           0.3060670
## 4  1.0529095          -0.2446627 -1.0006109  0.5405104          -0.6389365
##       BHQ4_lb    BHQ4_ub
## 1 -0.01393797 -1.0345410
## 2 -1.15024817  1.5705506
## 3  0.81527670  0.1520359
## 4 -0.94866295 -0.3164480
## 
## Clustering vector:
##   [1] 4 1 1 1 1 3 1 3 4 4 1 2 2 4 3 1 4 1 2 3 3 1 1 3 1 2 1 3 1 2 3 1 3 4 1 3 3
##  [38] 3 2 3 1 2 4 3 1 2 3 3 3 2 2 1 3 4 1 4 2 4 4 4 1 3 4 4 3 2 1 1 1 3 3 3 4 3
##  [75] 3 3 3 2 3 1 3 2 3 1 2 1 2 2 3 3 1 3 3 1 2 1 1 1 2 3 3 4 2 3 1 3 4 1 3 1 1
## [112] 3 3 4 1 3 2 3 4 1 4 3 3 3 1 2 1 3 4 3 3 3 3 3 3 2 2 3 2 3 3 2 3 4 1 1 3 3
## [149] 3 1 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 130.6324 117.0119 300.9756  88.1987
##  (between_SS / total_SS =  65.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# append cluster assignment to df
df <- cbind(df, BH_cluster = factor(km.res_BH$cluster))
df$BH_cluster
##   [1] 4 1 1 1 1 3 1 3 4 4 1 2 2 4 3 1 4 1 2 3 3 1 1 3 1 2 1 3 1 2 3 1 3 4 1 3 3
##  [38] 3 2 3 1 2 4 3 1 2 3 3 3 2 2 1 3 4 1 4 2 4 4 4 1 3 4 4 3 2 1 1 1 3 3 3 4 3
##  [75] 3 3 3 2 3 1 3 2 3 1 2 1 2 2 3 3 1 3 3 1 2 1 1 1 2 3 3 4 2 3 1 3 4 1 3 1 1
## [112] 3 3 4 1 3 2 3 4 1 4 3 3 3 1 2 1 3 4 3 3 3 3 3 3 2 2 3 2 3 3 2 3 4 1 1 3 3
## [149] 3 1 3 3 3
## Levels: 1 2 3 4

4.2. Level 1 validation on K means clustering - PCA & visualization

# run PCA and take 2-3 1st PCs to visualize clusters
pca_BH <- prcomp(scale(df[47:58]))
summary(pca_BH)
## Importance of components:
##                           PC1    PC2     PC3     PC4    PC5     PC6    PC7
## Standard deviation     2.6575 1.7654 1.01322 0.67539 0.4450 0.27786 0.1624
## Proportion of Variance 0.5885 0.2597 0.08555 0.03801 0.0165 0.00643 0.0022
## Cumulative Proportion  0.5885 0.8482 0.93379 0.97180 0.9883 0.99473 0.9969
##                            PC8     PC9    PC10    PC11      PC12
## Standard deviation     0.13209 0.11705 0.06502 0.03819 5.114e-08
## Proportion of Variance 0.00145 0.00114 0.00035 0.00012 0.000e+00
## Cumulative Proportion  0.99838 0.99953 0.99988 1.00000 1.000e+00
# 1st 2 PCs make up 84.8% cumulative proportion; 1st  3 make up 93.4%

# plot clusters using 1st 2 PCs
pcaBH_df <- data.frame(PC1 = pca_BH$x[,1],  
                     PC2 = pca_BH$x[,2],
                     PC3 = pca_BH$x[,3],
                     BH_cluster = df$BH_cluster,
                     country = df$country)

BH_two_PCs <- ggplot(pcaBH_df, aes(x = PC1, y = PC2, color = BH_cluster)) +
  geom_point() +
  labs(title = "BH Clusters Against first 2 Principle Components",
       x = "Principle Component 1",
       y = "Principle Component 2",
       color = "Clusters based on BH variables",
       fill = "Clusters based on BH variables") +
  theme_minimal()
BH_two_PCs

# plot clusters using 1st 3 PCs
 BH_three_PCs <- plot_ly(data = pcaBH_df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~factor(BH_cluster),
        type = "scatter3d", mode = "markers", marker = list(size = 5)) %>%
  layout(scene = list(xaxis = list(title = "Principle Component 1"),
                      yaxis = list(title = "Principle Component 2"),
                      zaxis = list(title = "Principle Component 3")),
         legend = list(title=list(text='<b> Clusters based on BH variables')),
         title = "3D Scatter Plot of PCA with BH Clusters")
BH_three_PCs

4.3. Level 2 validation on K means clustering - Descriptive using ANOVA and Posthocs

# CAT
catBH_anova <- aov(CAT ~ BH_cluster, data = df)
summary(catBH_anova)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## BH_cluster    3  1.829  0.6096   25.03 3.57e-13 ***
## Residuals   149  3.628  0.0244                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
catBH_posthoc <- TukeyHSD(catBH_anova)
catBH_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = CAT ~ BH_cluster, data = df)
## 
## $BH_cluster
##             diff         lwr         upr     p adj
## 2-1 -0.288947221 -0.39012403 -0.18777041 0.0000000
## 3-1 -0.003397638 -0.08416531  0.07737004 0.9995308
## 4-1 -0.136455765 -0.24316251 -0.02974902 0.0061119
## 3-2  0.285549584  0.19103999  0.38005918 0.0000000
## 4-2  0.152491457  0.03503944  0.26994347 0.0051832
## 4-3 -0.133058127 -0.23346552 -0.03265073 0.0041181
# YOS
yosBH_anova <- aov(YOS ~ BH_cluster, data = df)
summary(yosBH_anova)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## BH_cluster    3  1.283  0.4275   19.87 6.89e-11 ***
## Residuals   149  3.206  0.0215                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yosBH_posthoc <- TukeyHSD(yosBH_anova)
yosBH_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = YOS ~ BH_cluster, data = df)
## 
## $BH_cluster
##            diff        lwr          upr     p adj
## 2-1 -0.26828612 -0.3633943 -0.173177974 0.0000000
## 3-1 -0.04580024 -0.1217234  0.030122929 0.4003979
## 4-1 -0.10888827 -0.2091947 -0.008581886 0.0276226
## 3-2  0.22248588  0.1336450  0.311326720 0.0000000
## 4-2  0.15939785  0.0489907  0.269804996 0.0014226
## 4-3 -0.06308804 -0.1574729  0.031296847 0.3084581
# 1-COR
corBH_anova <- aov(1-COR ~ BH_cluster, data = df)
summary(corBH_anova)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## BH_cluster    3 0.8314  0.2772    46.2 <2e-16 ***
## Residuals   149 0.8938  0.0060                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
corBH_posthoc <- TukeyHSD(corBH_anova)
corBH_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = 1 - COR ~ BH_cluster, data = df)
## 
## $BH_cluster
##             diff         lwr         upr     p adj
## 2-1  0.157239020  0.10702133  0.20745670 0.0000000
## 3-1  0.159810949  0.11972305  0.19989885 0.0000000
## 4-1  0.031852488 -0.02110990  0.08481488 0.4031433
## 3-2  0.002571929 -0.04433658  0.04948044 0.9989644
## 4-2 -0.125386532 -0.18368218 -0.06709088 0.0000006
## 4-3 -0.127958461 -0.17779426 -0.07812266 0.0000000
# 1-BETA
betaBH_anova <- aov(1-BETA ~ BH_cluster, data = df)
summary(betaBH_anova)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## BH_cluster    3  2.001  0.6670   34.97 <2e-16 ***
## Residuals   149  2.842  0.0191                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
betaBH_posthoc <- TukeyHSD(betaBH_anova)
betaBH_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = 1 - BETA ~ BH_cluster, data = df)
## 
## $BH_cluster
##            diff         lwr          upr     p adj
## 2-1 -0.10741081 -0.19695290 -0.017868711 0.0116575
## 3-1  0.16345432  0.09197444  0.234934209 0.0000001
## 4-1 -0.09687715 -0.19131326 -0.002441033 0.0420049
## 3-2  0.27086513  0.18722356  0.354506698 0.0000000
## 4-2  0.01053366 -0.09341209  0.114479403 0.9935825
## 4-3 -0.26033147 -0.34919263 -0.171470315 0.0000000
# Gini
gini_BH_anova <- aov(gini ~ BH_cluster, data = df)
summary(gini_BH_anova)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## BH_cluster    3   1089   362.9   7.215 0.000149 ***
## Residuals   149   7495    50.3                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gini_BH_posthoc <- TukeyHSD(gini_BH_anova)
gini_BH_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = gini ~ BH_cluster, data = df)
## 
## $BH_cluster
##          diff        lwr        upr     p adj
## 2-1 -2.687509 -7.2859809  1.9109626 0.4290286
## 3-1 -6.214127 -9.8850064 -2.5432476 0.0001206
## 4-1 -1.299048 -6.1488539  3.5507586 0.8984982
## 3-2 -3.526618 -7.8220656  0.7688299 0.1474059
## 4-2  1.388462 -3.9497157  6.7266388 0.9061148
## 4-3  4.915079  0.3515775  9.4785813 0.0293654
# calculate mean and standard deviation for CAT scores within each cluster
catBH_summary <- df %>%
  group_by(BH_cluster) %>%
  summarize(mean_CAT = mean(CAT),
            sd_CAT = sd(CAT),
            n = n()) %>%
  mutate(se = sd_CAT / sqrt(n),
         ci_lower = mean_CAT - qt(0.975, n - 1) * se,
         ci_upper = mean_CAT + qt(0.975, n - 1) * se)
catBH_summary
# create jittered strip plot with mean points and error bars
catBH_plot <- ggplot(df, aes(x = BH_cluster, y = CAT, color = BH_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) +  
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) + 
  labs(x = "Cluster", y = "CAT", title = "CAT Scores by BH Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +  
  theme_minimal() +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
catBH_plot

# calculate mean and standard deviation for YOS within each cluster
yosBH_summary <- df %>%
  group_by(BH_cluster) %>%
  summarize(mean_YOS = mean(YOS),
            sd_YOS = sd(YOS))
yosBH_summary
# create jittered strip plot with mean lines
yosBH_plot <- ggplot(df, aes(x = BH_cluster, y = YOS, color = BH_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) +   
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +   
  labs(x = "Cluster", y = "YOS", title = "YOS Scores by BH Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +   
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
yosBH_plot

# calculate mean and standard deviation for 1-COR within each cluster
corBH_summary <- df %>%
  group_by(BH_cluster) %>%
  summarize(mean_1_COR = mean(1-COR),
            sd_1_COR = sd(1-COR))
corBH_summary
# create jittered strip plot with mean lines
corBH_plot <- ggplot(df, aes(x = BH_cluster, y = 1-COR, color = BH_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) + 
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) + 
  labs(x = "Cluster", y = "1-COR", title = "1-COR Scores by BH Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
corBH_plot

# calculate mean and standard deviation for 1-BETA within each cluster
betaBH_summary <- df %>%
  group_by(BH_cluster) %>%
  summarize(mean_1_BETA = mean(1-BETA),
            sd_1_BETA = sd(1-BETA))
betaBH_summary
# create the strip plot with mean lines
betaBH_plot <- ggplot(df, aes(x = BH_cluster, y = 1-BETA, color = BH_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) + 
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +   
  labs(x = "Cluster", y = "1-BETA", title = "1-BETA Scores by BH Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +   
  theme(axis.text.x = element_text(angle = 45, hjust = 1))   
betaBH_plot

# calculate mean and standard deviation for gini index within each cluster
gini_BH_summary <- df %>%
  group_by(BH_cluster) %>%
  summarize(mean_gini = mean(gini),
            sd_gini = sd(gini))

gini_BH_summary
# create the strip plot with mean lines
gini_BH_plot <- ggplot(df, aes(x = BH_cluster, y = gini, color = BH_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) + 
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +   
  labs(x = "Cluster", y = "Gini", title = "Gini Index by BH Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +   
  theme(axis.text.x = element_text(angle = 45, hjust = 1))   
gini_BH_plot

5. K means clustering on TM variables - Validation

After running K-means clustering on the 25 BH variables, we perform 2 levels of validations:

5.1. K means clustering on TM variables

# create plot of number of clusters vs total within sum of squares
fviz_nbclust(scale(df[63:87]), kmeans, method = "wss")

# k-means with k = 4
set.seed(234)
km.res.TM <- kmeans(scale(df[63:87]), 4, nstart = 25)
km.res.TM
## K-means clustering with 4 clusters of sizes 7, 46, 50, 50
## 
## Cluster means:
##         tm11        tm12       tm13        tm14        tm15       tm21
## 1 -0.2796238 -0.59317254  2.3811044 -0.96516041 -0.65992203  0.1117685
## 2 -0.4254435  0.16585453  0.1627500  0.02491049  0.76048609 -0.4476897
## 3  1.1320839 -0.15499211 -0.7024904 -0.72311271 -0.65324955  1.0611085
## 4 -0.7015285  0.08545009  0.2194058  0.83531752  0.04599143 -0.6648815
##         tm22         tm23        tm24       tm25        tm31        tm32
## 1 -0.5568371  2.576303724 -1.36567514 -0.9837743 -0.08658153 -0.05765707
## 2 -0.1345903  0.002895838  0.02157411  0.9696476 -0.49049473 -0.28739247
## 3  0.5060023 -0.499091374 -0.78069007 -0.5089787  1.08513258  0.84120851
## 4 -0.3042220  0.135744681  0.95203641 -0.2453686 -0.62175602 -0.56873544
##         tm33       tm34       tm35       tm41        tm42       tm43       tm44
## 1  3.1520051 -1.4170509 -0.9000162 -0.3491745  0.06535315  3.2453696 -1.2627014
## 2 -0.2187402 -0.2041437  1.1118133 -0.4581438 -0.29859942 -0.1784015 -0.4617618
## 3 -0.1174763 -0.7221351 -0.3553730  1.0349993  0.89980544  0.2124526 -0.4901917
## 4 -0.1225635  1.1083344 -0.5414929 -0.5646225 -0.63424341 -0.5026750  1.0917907
##         tm45       tm51       tm52       tm53       tm54        tm55
## 1 -0.5932604 -0.4367202 -0.6303033  2.6704574 -0.9017447 -0.63070070
## 2  1.0320114 -0.4037488 -0.4645420 -0.3550004 -0.5763151  0.90772260
## 3 -0.5587157  0.9587524  0.9905800  0.4460539 -0.2455907 -0.78966772
## 4 -0.3076783 -0.5261628 -0.4749588 -0.4933176  0.9020448  0.04286102
## 
## Clustering vector:
##   [1] 3 2 2 4 4 2 4 4 2 4 2 3 3 2 4 1 2 4 3 3 3 2 3 2 3 3 2 3 2 3 3 1 3 3 4 2 4
##  [38] 2 3 3 2 2 3 4 1 3 4 2 2 3 3 4 4 3 4 3 3 3 2 3 4 2 2 3 2 2 2 4 4 4 2 4 3 4
##  [75] 2 4 4 3 4 2 2 3 4 4 3 2 2 3 3 2 2 4 4 4 3 3 2 1 3 4 2 3 3 3 4 2 3 2 3 4 4
## [112] 4 4 2 4 4 3 1 3 4 4 4 4 2 4 3 2 4 2 4 4 2 4 3 2 3 3 1 3 4 1 3 2 2 4 4 4 3
## [149] 4 2 2 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 109.4936 537.5893 827.8046 552.5461
##  (between_SS / total_SS =  46.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# append cluster assignment to df
df <- cbind(df, TM_cluster = factor(km.res.TM$cluster))

5.2. Level 1 validation on K means clustering - PCA & visualization

# run PCA and take 2-3 1st PCs to visualize clusters
pca_TM <- prcomp(scale(df[63:87]))
summary(pca_TM)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     3.0515 2.0666 1.9912 1.5971 1.03791 0.78282 0.69863
## Proportion of Variance 0.3725 0.1708 0.1586 0.1020 0.04309 0.02451 0.01952
## Cumulative Proportion  0.3725 0.5433 0.7019 0.8039 0.84701 0.87152 0.89105
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.68239 0.62885 0.55618 0.53666 0.51012 0.45320 0.40843
## Proportion of Variance 0.01863 0.01582 0.01237 0.01152 0.01041 0.00822 0.00667
## Cumulative Proportion  0.90967 0.92549 0.93786 0.94938 0.95979 0.96801 0.97468
##                           PC15    PC16    PC17    PC18   PC19    PC20      PC21
## Standard deviation     0.40519 0.37007 0.34944 0.30308 0.2644 0.21897 1.378e-08
## Proportion of Variance 0.00657 0.00548 0.00488 0.00367 0.0028 0.00192 0.000e+00
## Cumulative Proportion  0.98125 0.98673 0.99161 0.99529 0.9981 1.00000 1.000e+00
##                             PC22      PC23      PC24      PC25
## Standard deviation     1.159e-08 1.051e-08 8.882e-09 8.357e-09
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00
# 1st 3 PCs make up 70.2% cumulative proportion

# plot 4 TM clusters using 1st 3 PCs
pca_TM_df <- data.frame(PC1 = pca_TM$x[,1],  
                     PC2 = pca_TM$x[,2],
                     PC3 = pca_TM$x[,3],
                     TM_cluster = df$TM_cluster,
                     country = df$country)



 TM_three_PCs <- plot_ly(data = pca_TM_df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~factor(TM_cluster),
        type = "scatter3d", mode = "markers", marker = list(size = 5)) %>%
  layout(scene = list(xaxis = list(title = "Principle Component 1"),
                      yaxis = list(title = "Principle Component 2"),
                      zaxis = list(title = "Principle Component 3")),
         title = "3D Scatter Plot of 3PCs on TM Clusters")
TM_three_PCs

5.3. Level 2 validation on K means clustering - Descriptive using ANOVA and Posthocs

# CAT
cat_TM_anova <- aov(CAT ~ TM_cluster, data = df)
summary(cat_TM_anova)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## TM_cluster    3  3.144  1.0480   67.51 <2e-16 ***
## Residuals   149  2.313  0.0155                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
catTM_posthoc <- TukeyHSD(cat_TM_anova)
catTM_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = CAT ~ TM_cluster, data = df)
## 
## $TM_cluster
##            diff         lwr         upr     p adj
## 2-1  0.16197756  0.03064071  0.29331440 0.0088949
## 3-1 -0.18401231 -0.31465346 -0.05337117 0.0019659
## 4-1  0.07321327 -0.05742787  0.20385442 0.4666712
## 3-2 -0.34598987 -0.41212745 -0.27985229 0.0000000
## 4-2 -0.08876429 -0.15490186 -0.02262671 0.0035546
## 4-3  0.25722559  0.19248054  0.32197064 0.0000000
# YOS
yos_TM_anova <- aov(YOS ~ TM_cluster, data = df)
summary(yos_TM_anova)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## TM_cluster    3  1.937  0.6457   37.71 <2e-16 ***
## Residuals   149  2.551  0.0171                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yosTM_posthoc <- TukeyHSD(yos_TM_anova)
yosTM_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = YOS ~ TM_cluster, data = df)
## 
## $TM_cluster
##            diff         lwr         upr     p adj
## 2-1  0.11364584 -0.02429643  0.25158811 0.1451455
## 3-1 -0.16437960 -0.30159118 -0.02716802 0.0118019
## 4-1  0.01936328 -0.11784830  0.15657486 0.9830921
## 3-2 -0.27802544 -0.34748932 -0.20856156 0.0000000
## 4-2 -0.09428255 -0.16374644 -0.02481867 0.0031114
## 4-3  0.18374289  0.11574156  0.25174421 0.0000000
# 1-COR
cor_TM_anova <- aov(1-COR ~ TM_cluster, data = df)
summary(cor_TM_anova)
##              Df Sum Sq  Mean Sq F value Pr(>F)
## TM_cluster    3 0.0271 0.009028   0.792    0.5
## Residuals   149 1.6982 0.011397
corTM_posthoc <- TukeyHSD(cor_TM_anova)
corTM_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = 1 - COR ~ TM_cluster, data = df)
## 
## $TM_cluster
##              diff         lwr        upr     p adj
## 2-1 -0.0014503713 -0.11398628 0.11108554 0.9999865
## 3-1  0.0271851557 -0.08475464 0.13912495 0.9219128
## 4-1 -0.0010821991 -0.11302200 0.11085760 0.9999943
## 3-2  0.0286355270 -0.02803442 0.08530548 0.5561888
## 4-2  0.0003681721 -0.05630178 0.05703812 0.9999983
## 4-3 -0.0282673549 -0.08374412 0.02720941 0.5492205
# 1-BETA
beta_TM_anova <- aov(1-BETA ~ TM_cluster, data = df)
summary(beta_TM_anova)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## TM_cluster    3  0.932 0.31065   11.84 5.36e-07 ***
## Residuals   149  3.911 0.02625                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
betaTM_posthoc <- TukeyHSD(beta_TM_anova)
betaTM_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = 1 - BETA ~ TM_cluster, data = df)
## 
## $TM_cluster
##            diff         lwr         upr     p adj
## 2-1 -0.11122661 -0.28200302  0.05954979 0.3313954
## 3-1 -0.22554284 -0.39541463 -0.05567105 0.0040286
## 4-1 -0.04631929 -0.21619109  0.12355250 0.8936102
## 3-2 -0.11431623 -0.20031447 -0.02831799 0.0039744
## 4-2  0.06490732 -0.02109092  0.15090556 0.2075414
## 4-3  0.17922355  0.09503600  0.26341110 0.0000008
# Gini
gini_TM_anova <- aov(gini ~ TM_cluster, data = df)
summary(gini_TM_anova)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## TM_cluster    3   1454   484.8   10.13 4.09e-06 ***
## Residuals   149   7129    47.8                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
giniTM_posthoc <- TukeyHSD(gini_TM_anova)
giniTM_posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = gini ~ TM_cluster, data = df)
## 
## $TM_cluster
##           diff         lwr       upr     p adj
## 2-1  -9.243727 -16.5353003 -1.952153 0.0067033
## 3-1  -4.772857 -12.0258069  2.480093 0.3222284
## 4-1 -10.884857 -18.1378069 -3.631907 0.0008309
## 3-2   4.470870   0.7990362  8.142703 0.0100952
## 4-2  -1.641130  -5.3129638  2.030703 0.6521365
## 4-3  -6.112000  -9.7065229 -2.517477 0.0001114
# calculate mean and standard deviation for CAT scores within each cluster
catTM_summary <- df %>%
  group_by(TM_cluster) %>%
  summarize(mean_CAT = mean(CAT),
            sd_CAT = sd(CAT),
            n = n()) %>%
  mutate(se = sd_CAT / sqrt(n),
         ci_lower = mean_CAT - qt(0.975, n - 1) * se,
         ci_upper = mean_CAT + qt(0.975, n - 1) * se)
catTM_summary
# create jittered strip plot with mean points and error bars
catTM_plot <- ggplot(df, aes(x = factor(TM_cluster), y = CAT, color = TM_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) +  
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) + 
  labs(x = "Cluster", y = "CAT", title = "TM CAT Scores by Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +  
  theme_minimal() +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
catTM_plot

# calculate mean and standard deviation for YOS within each cluster
yosTM_summary <- df %>%
  group_by(TM_cluster) %>%
  summarize(mean_YOS = mean(YOS),
            sd_YOS = sd(YOS))
yosTM_summary
# create jittered strip plot with mean lines
yos_plot <- ggplot(df, aes(x = TM_cluster, y = YOS, color = TM_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) +   
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +   
  labs(x = "Cluster", y = "YOS", title = "YOS Scores by Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +   
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
yos_plot

# calculate mean and standard deviation for 1-COR within each cluster
corTM_summary <- df %>%
  group_by(TM_cluster) %>%
  summarize(mean_1_COR = mean(1-COR),
            sd_1_COR = sd(1-COR))
corTM_summary
# create jittered strip plot with mean lines
corTM_plot <- ggplot(df, aes(x = TM_cluster, y = 1-COR, color = TM_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) + 
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) + 
  labs(x = "Cluster", y = "1-COR", title = "1-COR Scores by Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
corTM_plot

# calculate mean and standard deviation for 1-BETA within each cluster
betaTM_summary <- df %>%
  group_by(TM_cluster) %>%
  summarize(mean_1_BETA = mean(1-BETA),
            sd_1_BETA = sd(1-BETA))
betaTM_summary
# create jittered strip plot with mean lines
betaTM_plot <- ggplot(df, aes(x = TM_cluster, y = 1-BETA, color = TM_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) + 
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +   
  labs(x = "Cluster", y = "1-BETA", title = "1-BETA Scores by Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +   
  theme(axis.text.x = element_text(angle = 45, hjust = 1))   
betaTM_plot

# calculate mean and standard deviation for gini index within each cluster
giniTM_summary <- df %>%
  group_by(TM_cluster) %>%
  summarize(mean_gini = mean(gini),
            sd_gini = sd(gini))

giniTM_summary
# create the strip plot with mean lines
giniTM_plot <- ggplot(df, aes(x = TM_cluster, y = gini, color = TM_cluster)) +
  geom_jitter(width = 0.2, alpha = 0.7) + 
  stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +   
  labs(x = "Cluster", y = "Gini", title = "Gini Index by TM Cluster") +
  scale_color_manual(values = c("red", "green", "blue", "orange")) +   
  theme_minimal() +   
  theme(axis.text.x = element_text(angle = 45, hjust = 1))   
giniTM_plot

6. Comparing 2 K means clustering results

We’re creating a matrix to see how BH clusters and TM clusters overlap with each other.

# we're using a confusion matrix as it uses the sample principle
confusionMatrix(df$BH_cluster, df$TM_cluster)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  4 15  3 20
##          2  0  3 23  0
##          3  3 20 12 28
##          4  0  8 12  2
## 
## Overall Statistics
##                                          
##                Accuracy : 0.1373         
##                  95% CI : (0.087, 0.2021)
##     No Information Rate : 0.3268         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.143         
##                                          
##  Mcnemar's Test P-Value : 5.63e-09       
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity           0.57143  0.06522  0.24000  0.04000
## Specificity           0.73973  0.78505  0.50485  0.80583
## Pos Pred Value        0.09524  0.11538  0.19048  0.09091
## Neg Pred Value        0.97297  0.66142  0.57778  0.63359
## Prevalence            0.04575  0.30065  0.32680  0.32680
## Detection Rate        0.02614  0.01961  0.07843  0.01307
## Detection Prevalence  0.27451  0.16993  0.41176  0.14379
## Balanced Accuracy     0.65558  0.42513  0.37243  0.42291